// Top Secret Crypto Gold for Windows
//...................................

// Copyright  2000 - 2005 by TAN$TAAFL Software Company
//						      14 Foster St., Banician
//                            Olongapo City 2200
//                            Philippines

// This source code is NOT IN THE PUBLIC DOMAIN and is NOT OPEN SOURCE.
// It is provided solely for the purpose of letting you determine how
// the program works, and that there are no backdoors or hidden code
// in the program. Anyone that wants to use any portion of this code
// in their own program please contact the author at:

//							  MacGregor K. Phillips
//                            PSC 517 Box RS
//                            FPO AP 96517-1000

// C source code for multiprecision arithmetic library routines.

// These routines implement all of the multiprecision arithmetic
// necessary for number-theoretic cryptographic algorithms such as
// ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
// composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
// key cryptography.

// Although originally developed in Microsoft C for the IBM PC, this code
// contains few machine dependencies.  It assumes 2's complement
// arithmetic.  It can be adapted to 8-bit, 16-bit, or 32-bit machines,
// lowbyte-highbyte order or highbyte-lowbyte order.  This version
// has been converted to ANSI C.

// The internal representation for these extended precision integer
// "registers" is an array of "units".  A unit is a machine word, which
// is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
// unsigned integer, depending on the machine's word size.  For example,
// an IBM PC or AT uses a unit size of 16 bits.  To perform arithmetic
// on these huge precision integers, we pass pointers to these unit
// arrays to various subroutines.  A pointer to an array of units is of
// type unitptr.  This is a pointer to a huge integer "register".

// When calling a subroutine, we always pass a pointer to the BEGINNING
// of the array of units, regardless of the byte order of the machine.
// On a lowbyte-first machine, such as the Intel 80x86, this unitptr
// points to the LEAST significant unit, and the array of units increases
// significance to the right.  On a highbyte-first machine, such as the
// Motorola 680x0, this unitptr points to the MOST significant unit, and
// the array of units decreases significance to the right.
//.......................................................................
#include <windows.h>  
#include "Tsc.h"
#include "ContextHelp.h"
#include "Prototypes.h"
#include <Shlwapi.h>
#include <Commctrl.h>
#include <htmlhelp.h>
#include <shellapi.h>
#include <shlobj.h>
#include "Tscmsg.h"
#include "Check.h"
#include "mpilib.h"
#define STRSAFE_LIB

extern	BOOL		bCancelOperation;
extern	DWORD		dwU_Bytes;
extern	HWND		hMainWindow;

// Number of units of precision used by every routine in this file.
// It is declared as a 32-bit unsigned int because the inline assembler
// routines below load it with a DWORD move.
//................................................................................
unsigned int global_precision = 0;     

// global_precision is the unit precision last set by set_precision.
// Initially, set_precision() should be called to define global_precision
// before using any of these other multiprecision library routines.
//   i.e.:   set_precision(MAX_UNIT_PRECISION);

// Multiprecision library primitives.
// The following portable C primitives should be recoded in assembly.
// The entry point name should be defined, in "mpilib.h" to the external
// entry point name. If undefined, the C version will be used.

// Shorthand for unsigned long int.
// NOTE(review): not referenced anywhere in the visible part of this file.
typedef unsigned long int ulint;
 
// multiprecision add with carry r2 to r1, result in r1.
// carry is incoming carry flag-- value should be 0 or 1.
//.......................................................
boolean mp_addc(register unitptr r1, register unitptr r2, register boolean carry)
{
	// Adds the global_precision 32-bit units at r2 into the units at r1,
	// starting at index 0, chaining the carry through the CPU carry flag.
	// There is no C return statement: the assembler block leaves the
	// outgoing carry (0 or 1) in EAX, which relies on the compiler's
	// convention of returning integer results in EAX.
	// NOTE(review): the loop counter is tested after the first pass, so
	// global_precision is assumed to be non-zero on entry.
	__asm
	{
		mov		edi,DWORD PTR [global_precision]	// edi = units left to process
		mov		ecx,DWORD PTR [r1]					// ecx = destination / first operand
		mov		edx,DWORD PTR [r2]					// edx = second operand
		xor		esi,esi								// esi = unit index

		// Setup carry.
		// Note that the instruction above clears the carry.
		// rcr then rotates bit 0 of the incoming carry into CF.
		//..................................................
		mov		eax,DWORD PTR [carry]
		rcr		eax,1

loop_t3:
		mov		eax,DWORD PTR [ecx + esi * 4]
		mov		ebx,DWORD PTR [edx + esi * 4]
		adc		eax,ebx								// add with carry from previous unit
		mov		DWORD PTR [ecx + esi * 4],eax
		inc		esi									// inc/dec leave CF untouched,
		dec		edi									// so the carry survives to the next adc
		jnz		loop_t3

		// Compute carry.
		// rcl rotates CF into bit 0 of eax; the mask keeps only that bit.
		//...............
		rcl		eax,1
		and		eax,1
	}
}

// Multiprecision subtract with borrow, r2 from r1, result in r1.
// Borrow is incoming borrow flag-- value should be 0 or 1.
//...............................................................
boolean mp_subb(register unitptr r1, register unitptr r2, register boolean borrow)
{
	// Subtracts the global_precision 32-bit units at r2 from the units at
	// r1, starting at index 0, chaining the borrow through the CPU carry
	// flag.  There is no C return statement: the assembler block leaves
	// the outgoing borrow (0 or 1) in EAX, which relies on the compiler's
	// convention of returning integer results in EAX.
	// NOTE(review): the loop counter is tested after the first pass, so
	// global_precision is assumed to be non-zero on entry.
	__asm
	{
		mov		edi,DWORD PTR [global_precision]	// edi = units left to process
		mov		ecx,DWORD PTR [r1]					// ecx = destination / minuend
		mov		edx,DWORD PTR [r2]					// edx = subtrahend
		xor		esi,esi								// esi = unit index

		// Setup carry.
		// Note that the instruction above clears the carry.
		// rcr then rotates bit 0 of the incoming borrow into CF.
		//..................................................
		mov		eax,DWORD PTR [borrow]
		rcr		eax,1

	loop_t3:	
		mov		eax,DWORD PTR [ecx + esi * 4]
		mov		ebx,DWORD PTR [edx + esi * 4]
		sbb		eax,ebx								// subtract with borrow from previous unit
		mov		DWORD PTR [ecx + esi * 4],eax
		inc		esi									// inc/dec leave CF untouched,
		dec		edi									// so the borrow survives to the next sbb
		jnz		loop_t3

		// Compute carry.
		// rcl rotates CF into bit 0 of eax; the mask keeps only that bit.
		//...............
		rcl		eax,1
		and		eax,1
	}
}

// Multiprecision rotate left 1 bit with carry, result in r1.
// Carry is incoming carry flag - value should be 0 or 1.
//...........................................................
boolean mp_rotate_left(register unitptr r1, register boolean carry)
{
	// Rotates the global_precision 32-bit units at r1 left by one bit,
	// starting at index 0.  The incoming carry becomes bit 0 of the first
	// unit and the bit rotated out of each unit feeds the next one through
	// the CPU carry flag.  There is no C return statement: the assembler
	// block leaves the bit rotated out of the last unit (0 or 1) in EAX,
	// relying on the compiler's convention of returning results in EAX.
	// NOTE(review): the loop counter is tested after the first pass, so
	// global_precision is assumed to be non-zero on entry.
	__asm
	{
		mov		edi,DWORD PTR [global_precision]	// edi = units left to process
		mov		ecx,DWORD PTR [r1]					// ecx = register being rotated
		xor		esi,esi								// esi = unit index

		// Setup carry.
		// Note that the instruction above clears the carry
		// rcr then rotates bit 0 of the incoming carry into CF.
		//.................................................
		mov		eax,DWORD PTR [carry]
		rcr		eax,1

	loop_t3:
		mov		eax,DWORD PTR [ecx + esi * 4]
		rcl		eax,1								// CF in at bit 0, old bit 31 out to CF
		mov		DWORD PTR [ecx + esi * 4],eax
		inc		esi									// inc/dec leave CF untouched
		dec		edi
		jnz		loop_t3

		// Compute carry.
		// rcl rotates CF into bit 0 of eax; the mask keeps only that bit.
		//...............
		rcl		eax,1
		and		eax,1
	}
}

// The mp_shift_right_bits function is not called in any time-critical
// situations in public-key cryptographic functions, so it doesn't
// need to be coded in assembly language.

// multiprecision shift right bits, result in r1.
// bits is how many bits to shift, must be <= UNITSIZE.
//.....................................................
void mp_shift_right_bits(register unitptr r1, register int bits)
{
	// Shifts the global_precision units at r1 right by 'bits' bit
	// positions (0 <= bits <= UNITSIZE), walking from the most
	// significant unit down and feeding the bits that fall out of each
	// unit into the top of the next lower one.  Zeros are shifted in at
	// the top.
    unsigned int	precision;	// number of units to shift.
    register unit	carry, nextcarry, bitmask;
    int				unbits;

    if (bits == 0)
	{
		return;						// shift zero bits is a no-op.
	}
    carry = 0;
    precision = global_precision;
    make_msbptr(r1, precision);

    if (bits == UNITSIZE) 
	{
		// Whole-unit shift: every unit simply moves down one position.
		// No mask is needed here, so none is computed.
		while (precision--) 
		{
			nextcarry = *r1;
			*r1 = carry;
			carry = nextcarry;
			pre_lowerunit(r1);
		}
    } 
	else 
	{
		// The mask and the complementary shift count are only formed
		// when bits < UNITSIZE, so power_of_2() is never asked for a
		// full-width shift (presumably 1 << bits, which would be
		// undefined for bits == UNITSIZE).
		bitmask = power_of_2(bits) - 1;
		unbits = UNITSIZE - bits;
		while (precision--) 
		{
			nextcarry = *r1 & bitmask;	// bits that fall out of this unit.
			*r1 >>= bits;
			*r1 |= carry << unbits;		// bits from the unit above.
			carry = nextcarry;
			pre_lowerunit(r1);
		}
    }
}

// Compares multiprecision integers *r1, *r2, and returns:
// -1 iff *r1 < *r2
//  0 iff *r1 == *r2
// +1 iff *r1 > *r2
//........................................................
short mp_compare(register unitptr r1, register unitptr r2)
{
	// Walk both registers from the most significant unit downwards; the
	// first unit that differs decides the ordering.
    unsigned int	 remaining = global_precision;

    make_msbptr(r1, remaining);
    make_msbptr(r2, remaining);
    for (;;)
	{
		if (*r1 < *r2)
		{
			return -1;
		}
		if (*r1 > *r2)
		{
			return 1;
		}
		// Units are equal: step both pointers to the next lower unit.
		post_lowerunit(r1);
		post_lowerunit(r2);
		if (--remaining == 0)
		{
			break;
		}
	}
    return 0;
}

// Increment multiprecision integer r.
//....................................
boolean mp_inc(register unitptr r)
{
	// Adds one to the least significant unit and ripples upward for as
	// long as a unit wraps around to zero.  Returns 1 only if every unit
	// wrapped (carry out of the whole register), otherwise 0.
    unsigned int	 remaining = global_precision;

    make_lsbptr(r, remaining);
    for (;;)
	{
		*r += 1;
		if (*r != 0)
		{
			return 0;		// this unit did not wrap: no carry.
		}
		post_higherunit(r);
		if (--remaining == 0)
		{
			break;
		}
	}
    return 1;				// carry out of the top unit.
}

// Decrement multiprecision integer r.
//....................................
boolean mp_dec(register unitptr r)
{
    unsigned int	 precision;

    precision = global_precision;
    make_lsbptr(r, precision);
    do 
	{
		if ((signedunit) (--(*r)) != (signedunit) - 1)
		{
			return 0;		// no borrow.
		}
		post_higherunit(r);
    } while (--precision);
    return 1;				// return borrow set.
}

#ifndef mp_move

// Copy global_precision units from src to dst, lowest address first.
// Only compiled when mp_move has not been defined as a macro.
//...................................................................
void mp_move(register unitptr dst, register unitptr src)
{
    unsigned int	 left = global_precision;	// units still to copy.

    for (;;)
	{
		*dst = *src;
		++dst;
		++src;
		if (--left == 0)
		{
			break;
		}
	}
}
#endif

// Init multiprecision register r with short value.
//.................................................
void mp_init(register unitptr r, word32 value)
{	
	// Clears all global_precision units of r, then stores value in the
	// least significant unit.  The value is not sign-extended into the
	// higher units.

    unitfill0(r, global_precision);
    make_lsbptr(r, global_precision);
    *r = value;
}

// Returns number of significant units in r.
//..........................................
short significance(register unitptr r)
{
	// Scan from the most significant unit downwards; the count still
	// remaining when the first non-zero unit is met is the answer.
	// An all-zero register yields 0.
    unsigned int count = global_precision;

    make_msbptr(r, count);
    do 
	{
		if (*r != 0)
		{
			return count;
		}
		post_lowerunit(r);
		--count;
    } while (count != 0);
    return 0;
}

// Zero-fill the unit buffer r.
//.............................
void unitfill0(unitptr r, word32 unitcount)
{
	// Store zero into the first unitcount units starting at r.
    word32	idx;

    for (idx = 0; idx < unitcount; ++idx)
	{
		r[idx] = 0;
	}
}

// Unsigned divide, treats both operands as positive.
//...................................................
int mp_udiv(register unitptr remainder, register unitptr quotient,
			register unitptr dividend, register unitptr divisor)
{
	// Bit-at-a-time long division.  Returns -1 for a zero divisor, -15 if
	// bCancelOperation is seen set during the loop (remainder and quotient
	// are then left partially computed), and 0 otherwise.
    int				bits;
    long			dprec;
    register unit	bitmask;

    if (testeq(divisor, 0))
	{
		return -1;			// zero divisor means divide error.
	}
    mp_init0(remainder);
    mp_init0(quotient);

    // Normalize and compute number of bits in dividend first.
	// NOTE(review): init_bitsniffer is a macro defined outside this file;
	// it presumably sets dprec/bits/bitmask to address the top significant
	// bit of dividend and may return 0 from this function when dividend
	// is zero -- confirm against mpilib.h.
	//........................................................
    init_bitsniffer(dividend, bitmask, dprec, bits);

    // Rescale quotient to same precision (dprec) as dividend.
	//........................................................
    rescale(quotient, global_precision, dprec);
    make_msbptr(quotient, dprec);

    while (bits--) 
	{
		// Keep the UI responsive and honour a user cancel request.
		CheckForMessages();
		if (bCancelOperation)
		{
			return -15;
		}
		// Shift the next dividend bit into the bottom of remainder.
		mp_rotate_left(remainder,(boolean) (sniff_bit(dividend, bitmask) != 0));

		// If the divisor now fits, subtract it and set this quotient bit.
		if (mp_compare(remainder, divisor) >= 0) 
		{
			mp_sub(remainder, divisor);
			stuff_bit(quotient, bitmask);
		}
		// Advance dividend and quotient to the next lower bit together.
		bump_2bitsniffers(dividend, quotient, bitmask);
    }
    return 0;
}

#define RECIPMARGIN 0		// extra margin bits used by mp_recip().

// Compute reciprocal (quotient) as 1/divisor.  Used by faster modmult.
//
// The quotient is produced one bit per pass by long division, starting
// from a remainder in which only bit (countbits(divisor) - 1) is set.
// Returns -1 for a zero divisor, -15 if bCancelOperation is seen set
// during the loop (quotient is then incomplete), and 0 on success.
// The local remainder is wiped on every path that has written to it.
//.....................................................................
int mp_recip(register unitptr quotient, register unitptr divisor)
{
    int				bits;
    int				status;
    long			qprec;
    register unit	bitmask;
    unit			remainder[MAX_UNIT_PRECISION];

    if (testeq(divisor, 0))
	{
		return -1;		// zero divisor means divide error.
	}
    mp_init0(remainder);
    mp_init0(quotient);

    // Normalize and compute number of bits in quotient first.
	//........................................................
    bits = countbits(divisor) + RECIPMARGIN;
    bitmask = bitmsk(bits);		// bitmask within a single unit.
    qprec = bits2units(bits + 1);
    mp_setbit(remainder, (bits - RECIPMARGIN) - 1);

    // rescale quotient to precision of divisor + RECIPMARGIN bits.
	//.............................................................
    rescale(quotient, global_precision, qprec);
    make_msbptr(quotient, qprec);

    status = 0;
    while (bits--) 
	{
		CheckForMessages();
		if (bCancelOperation)
		{
			// Leave through the common exit so that the partial
			// remainder is not left behind on the stack.
			status = -15;
			break;
		}
		mp_shift_left(remainder);
		if (mp_compare(remainder, divisor) >= 0) 
		{
			mp_sub(remainder, divisor);
			stuff_bit(quotient, bitmask);
		}
		bump_bitsniffer(quotient, bitmask);
    }
    mp_init0(remainder);	// burn sensitive data left on stack.
    return status;
}

// Unsigned modulo: computes remainder = dividend mod divisor, treating
// both operands as positive.  No quotient is produced.
// Returns -1 for a zero divisor, -15 if bCancelOperation is seen set
// during the loop (remainder is then incomplete), and 0 otherwise.
//...................................................
int mp_mod(register unitptr remainder, register unitptr dividend, 
		   register unitptr divisor)
{
    int				bits;
    long			dprec;
    register unit	bitmask;

    if (testeq(divisor, 0))
	{
		return -1;		// zero divisor means divide error.
	}
    mp_init0(remainder);

    // Normalize and compute number of bits in dividend first.
	// NOTE(review): init_bitsniffer is a macro defined outside this file;
	// it may return 0 from this function when dividend is zero -- confirm
	// against mpilib.h.
	//........................................................
    init_bitsniffer(dividend, bitmask, dprec, bits);

    while (bits--) 
	{
		// Keep the UI responsive and honour a user cancel request.
		CheckForMessages();
		if (bCancelOperation)
		{
			return -15;
		}
		// Shift the next dividend bit into the bottom of remainder.
		mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask) != 0));
		// msub is defined outside this file; presumably it subtracts
		// divisor from remainder when remainder >= divisor -- TODO confirm.
		msub(remainder, divisor);
		bump_bitsniffer(dividend, bitmask);
    }
    return 0;
}

// Computes multiprecision prod = multiplicand * multiplier.
// Uses "Russian peasant" multiply algorithm.
//..........................................................
int mp_mult(register unitptr prod, register unitptr multiplicand, 
			register unitptr multiplier)
{
	// Shift-and-add multiply: for every multiplier bit, most significant
	// first, double prod and add multiplicand when the bit is set.
	// Returns 0 on completion and -15 if bCancelOperation is seen set
	// (prod is then incomplete).
    int				remaining;		// multiplier bits still to process.
    long			mult_units;		// filled in by init_bitsniffer.
    register unit	probe;			// mask selecting the current multiplier bit.

    mp_init(prod, 0);
    if (testeq(multiplicand, 0))
	{
		return 0;		// zero multiplicand means zero product.
	}

    // Position the bit sniffer on the top significant multiplier bit.
	//..........................................................
    init_bitsniffer(multiplier, probe, mult_units, remaining);

    while (remaining-- != 0) 
	{
		CheckForMessages();
		if (bCancelOperation)
		{
			return -15;
		}
		mp_shift_left(prod);
		if (sniff_bit(multiplier, probe) != 0)
		{
			mp_add(prod, multiplicand);
		}
		bump_bitsniffer(multiplier, probe);
    }
    return 0;
}

// mp_modmult computes a multiprecision multiply combined with a
// modulo operation.  This is the most time-critical function in
// this multiprecision arithmetic library for performing modulo
// exponentiation.  We experimented with different versions of modmult,
// depending on the machine architecture and performance requirements.
// We will either use a Russian Peasant modmult (peasant_modmult), 
// Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
// modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
// On machines with a hardware atomic multiply instruction,
// Smith's modmult is fastest.  It can utilize assembly subroutines to
// speed up the hardware multiply logic and trial quotient calculation.
// If the machine lacks a fast hardware multiply, Merritt's modmult
// is preferred, which doesn't call any assembly multiply routine.
// We use the alias names mp_modmult, stage_modulus, and modmult_burn
// for the corresponding true names, which depend on what flavor of
// modmult we are using.

// Before making the first call to mp_modmult, you must set up the
// modulus-dependent precomputed tables by calling stage_modulus.
// After making all the calls to mp_modmult, you call modmult_burn to
// erase the tables created by stage_modulus that were left in memory.
//......................................................................

// Used by Upton's and Smith's modmult algorithms.
//................................................
long munit_prec;		// global_precision expressed in MULTUNITs; recomputed by mp_dmul on every call

#pragma warning(disable : 4731)

void mp_smul(MULTUNIT * prod, MULTUNIT * multiplicand, MULTUNIT multiplier)
{
	// Multiply-accumulate: prod += multiplicand * multiplier, where
	// multiplicand is global_precision 32-bit words long and multiplier
	// is a single 32-bit word.  The final carry word is added into
	// prod[global_precision], so prod must have at least one word of
	// room beyond global_precision.
	//
	// The multiplier is used directly as a memory operand of MUL rather
	// than being loaded into EBP.  EBP is the frame pointer the compiler
	// may use to address the parameters, so it is left untouched here
	// (repurposing it is what compiler warning C4731 is about).
	//
	// NOTE(review): the loop counter is tested after the first pass, so
	// global_precision is assumed to be non-zero on entry.
	__asm
	{
		mov		ecx,DWORD PTR [global_precision]	// ecx = words left to process
		mov		edi,DWORD PTR [prod]				// edi = running product pointer
		mov		esi,DWORD PTR [multiplicand]		// esi = running multiplicand pointer
		xor		ebx,ebx								// ebx = carry word between passes

	loop_smul:
		mov		eax,DWORD PTR [esi]
		mul		DWORD PTR [multiplier]				// edx:eax = word * multiplier
		add		eax,ebx								// add carry from previous word
		adc		edx,0
		add		eax,DWORD PTR [edi]					// accumulate into existing product
		adc		edx,0
		mov		DWORD PTR [edi],eax
		mov		ebx,edx								// high half becomes next carry
		add		esi,4
		add		edi,4
		dec		ecx
		jnz		loop_smul

		add		DWORD PTR [edi],ebx					// fold the last carry into the next word
	}
}

#pragma warning(default : 4731)

/*
 * mp_dmul forms the double-precision product of multiplicand and
 * multiplier in prod.  prod must point at a buffer twice as long as
 * the operands: if global_precision is 10 words, prod needs 20 words.
 * The product is built by one mp_smul multiply-accumulate pass per
 * multiplier word.  If bCancelOperation is seen set, the function
 * returns early and prod holds only a partial result.
 */
#ifndef mp_dmul
void mp_dmul(unitptr prod, unitptr multiplicand, unitptr multiplier)
{
    register MULTUNIT	*src, *factors;		// multiplicand words / multiplier words.
    register MULTUNIT	*dest;				// where the current partial product lands.
    register int		step;

    // Start from an all-zero product, since mp_smul accumulates.
	//...........................................................
    unitfill0(prod, global_precision * 2);

    // Precision expressed in MULTUNITs rather than units.
	//....................................................
    munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;

    src = (MULTUNIT *) multiplicand;
    factors = (MULTUNIT *) multiplier;
    dest = (MULTUNIT *) prod;
    make_lsbptr(src, munit_prec);
    make_lsbptr(factors, munit_prec);
    make_lsbptr(dest, munit_prec * 2);

    // One pass per multiplier word, each landing one word higher in prod.
	//....................................................................
    step = 0;
    while (step < munit_prec)
	{
		CheckForMessages();
		if (bCancelOperation)
		{
			return;
		}
		mp_smul(dest, src, *factors);
		post_higherunit(dest);
		post_higherunit(factors);
		++step;
	}
}
#endif

// Private copy of the modulus made by stage_upton_modulus().
static unit		modulus[MAX_UNIT_PRECISION];
static long		nbits;		// number of modulus significant bits.

// These scratchpad arrays are filled by stage_upton_modulus()
// (reciprocal) and upton_modmult() (the rest).
// Some of them could be statically declared inside of mp_modmult, but we
// put them outside mp_modmult so that they can be wiped clean by
// modmult_burn(), which is called at the end of mp_modexp.  This is
// so that no sensitive data is left in memory after the program exits.
//......................................................................
static unit reciprocal[MAX_UNIT_PRECISION];		// reciprocal of modulus, from mp_recip().
static unit dhi[MAX_UNIT_PRECISION];			// high part of the raw product.
static unit d_data[MAX_UNIT_PRECISION * 2];		// double-length raw product.
static unit e_data[MAX_UNIT_PRECISION * 2];		// double-length dhi * reciprocal.
static unit f_data[MAX_UNIT_PRECISION * 2];		// double-length quotient estimate * modulus.

// nbits split into whole units and left-over bits; set by
// stage_upton_modulus() and used by upton_modmult() for its shifts.
static long nbitsDivUNITSIZE;
static long nbitsModUNITSIZE;

// stage_upton_modulus() is aliased to stage_modulus().
// Prepares for a run of Upton modmults: keeps a private copy of the
// modulus n, computes its reciprocal, and records the modulus bit
// length both as a total and split into whole units plus left-over
// bits.  Note that reciprocal will have one more bit than modulus.
// Assumes that global_precision has already been adjusted to the
// size of the modulus, plus SLOP_BITS.
// Always returns 0; the status returned by mp_recip() is not examined.
//....................................................................
int stage_upton_modulus(unitptr n)
{
    long	sigbits;

    mp_move(modulus, n);
    mp_recip(reciprocal, modulus);

    sigbits = countbits(modulus);
    nbits = sigbits;
    nbitsDivUNITSIZE = sigbits / UNITSIZE;
    nbitsModUNITSIZE = sigbits % UNITSIZE;
    return 0;
}

// Upton's algorithm performs a multiply combined with a modulo operation.
// Computes:  prod = (multiplicand*multiplier) mod modulus
// WARNING: All the arguments must be less than the modulus!
// References the file-level modulus and reciprocal set up by
// stage_upton_modulus().
// The reciprocal of modulus is 1 bit longer than the modulus.
// upton_modmult() is aliased to mp_modmult().
//
// Method: form the double-length product d, estimate the quotient from
// the high part of d times the reciprocal, multiply that estimate by the
// modulus and subtract it from d, then correct the small remaining
// excess by repeated subtraction.
// Returns 0 on completion, or -15 if bCancelOperation is seen set at one
// of the two checkpoints (prod is then not written).
//.......................................................................
int upton_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
{
	unsigned int	orig_precision;
    unitptr d = d_data;
    unitptr d1 = d_data;
    unitptr e = e_data;
    unitptr f = f_data;
    
    orig_precision = global_precision;
    mp_dmul(d, multiplicand, multiplier);

    // Throw off low nbits of d.
	//..........................
    d1 = d + nbitsDivUNITSIZE;

    mp_move(dhi, d1);				// Don't screw up d, we need it later.
    mp_shift_right_bits(dhi, nbitsModUNITSIZE);
    mp_dmul(e, dhi, reciprocal);	// Note - reciprocal has nbits+1 bits.

	EmptyTheMessageQue();
	if (bCancelOperation)
	{
		return -15;
	}
    e += nbitsDivUNITSIZE;
    mp_shift_right_bits(e, nbitsModUNITSIZE);
    mp_dmul(f, e, modulus);

	EmptyTheMessageQue();
	if (bCancelOperation)
	{
		return -15;
	}
    //Now for the only double-precision call to mpilib.
	//.................................................
    set_precision(orig_precision * 2);
    mp_sub(d, f);

    // d's precision should be <= orig_precision.
	//...........................................
    rescale(d, orig_precision * 2, orig_precision);
    set_precision(orig_precision);

    // Should never have to do this final subtract more than twice.
	// The comparison is >= so that a value exactly equal to the modulus
	// is reduced to zero; the result must lie in [0, modulus).
	//.............................................................
    while (mp_compare(d, modulus) >= 0)
	{
		mp_sub(d, modulus);
	}
    mp_move(prod, d);
    return 0;
}

// upton_burn() is aliased to modmult_burn().
// Upton's modmult keeps its working data in file-level arrays, so
// mp_modexp calls this when it is finished to zero every one of them
// together with the recorded modulus bit counts.  This is so that no
// cryptographically sensitive data is left in memory after the
// program exits.
//......................................................................
void upton_burn(void)
{
    // Double-length scratch buffers.
	//...............................
    unitfill0(d_data, MAX_UNIT_PRECISION * 2);
    unitfill0(e_data, MAX_UNIT_PRECISION * 2);
    unitfill0(f_data, MAX_UNIT_PRECISION * 2);

    // Single-length buffers.
	//.......................
    unitfill0(dhi, MAX_UNIT_PRECISION);
    unitfill0(reciprocal, MAX_UNIT_PRECISION);
    unitfill0(modulus, MAX_UNIT_PRECISION);

    // Recorded bit counts.
	//.....................
    nbitsModUNITSIZE = 0;
    nbitsDivUNITSIZE = 0;
    nbits = 0;
}

// Returns number of significant bits in r.
//.........................................
int countbits(unitptr r)
{
	// All of the work is done by the init_bitsniffer macro, which leaves
	// the bit count in its last argument; the mask and unit count it also
	// produces are not needed here.
	// NOTE(review): init_bitsniffer is defined outside this file and
	// presumably returns 0 from this function itself when r is zero.
    int				sigbits;
    int				sigunits;
    register unit	topmask;

    init_bitsniffer(r, topmask, sigunits, sigbits);
    return sigbits;
}

// Russian peasant combined exponentiation/modulo algorithm.
// Calls modmult instead of mult.
// Computes:  expout = (expin**exponent) mod modulus
// WARNING: All the arguments must be less than the modulus!
//
// Return values:
//    0  success (also for a zero exponent with non-zero expin; expout = 1)
//   -1  0 to the 0th power
//   -2  zero (or, when SLOP_BITS > 0, negative) modulus
//   -3  expin >= modulus
//   -4  exponent >= modulus
//   -5  stage_modulus() rejected the modulus
//  -15  bCancelOperation was seen set; expout is then not meaningful
//
// global_precision is temporarily reduced to fit the modulus.  On every
// exit taken after that point the scratch data is burned and the
// original precision is restored, including the cancel exit.
//..........................................................
int mp_modexp(register unitptr expout, register unitptr expin,
			  register unitptr exponent, register unitptr modulus)
{
    int				bits;
    int				status;
    unsigned int	oldprecision;
    register unit	bitmask;
    unit			product[MAX_UNIT_PRECISION];
    unsigned int	eprec;

    mp_init(expout, 1);
    if (testeq(exponent, 0)) 
	{
		if (testeq(expin, 0))
		{
			return -1;		// 0 to the 0th power means return error.
		}
		return 0;			// otherwise, zero exponent means expout is 1.
    }

    if (testeq(modulus, 0))
	{
		return -2;			// zero modulus means error.
	}

	#if SLOP_BITS > 0		// if there's room for sign bits.
	if (mp_tstminus(modulus))
	{
		return -2;			// negative modulus means error.
	}
	#endif

    if (mp_compare(expin, modulus) >= 0)
	{
		return -3;			// if expin >= modulus, return error.
	}
    if (mp_compare(exponent, modulus) >= 0)
	{
		return -4;			// if exponent >= modulus, return error.
	}

    oldprecision = global_precision;	// save global_precision.

    // Set smallest optimum precision for this modulus.
	//.................................................
    set_precision(bits2units(countbits(modulus) + SLOP_BITS));

    // Rescale all these registers to global_precision we just defined.
	//.................................................................
    rescale(modulus, oldprecision, global_precision);
    rescale(expin, oldprecision, global_precision);
    rescale(exponent, oldprecision, global_precision);
    rescale(expout, oldprecision, global_precision);

    if (stage_modulus(modulus)) 
	{
		set_precision(oldprecision);	// restore original precision.
		return -5;						// unstageable modulus (STEWART algorithm).
    }
    // normalize and compute number of bits in exponent first.
	//........................................................
    init_bitsniffer(exponent, bitmask, eprec, bits);

    // We can "optimize out" the first modsquare and modmult.
	// We know for sure at this point that bits > 0.
	//.......................................................
    bits--;
    mp_move(expout, expin);				//  expout = (1*1)*expin.
    bump_bitsniffer(exponent, bitmask);

    status = 0;
    while (bits--) 
	{
		CheckForMessages();
		if (bCancelOperation)
		{
			status = -15;
			break;
		}
		mp_modsquare(product, expout);
		if (bCancelOperation)
		{
			// The square was interrupted, so product is not valid.
			status = -15;
			break;
		}
		if (sniff_bit(exponent, bitmask)) 
		{
			mp_modmult(expout, product, expin);
			if (bCancelOperation)
			{
				// The multiply was interrupted, so expout is not valid.
				status = -15;
				break;
			}
		} 
		else 
		{
			mp_move(expout, product);
		}
		bump_bitsniffer(exponent, bitmask);
    }

    // Common exit for both completion and cancel: nothing sensitive may
	// stay behind and the caller's precision must come back.
	//.................................................................
    mp_burn(product);					// burn the evidence on the stack.
    modmult_burn();						// ask mp_modmult to also burn its own evidence.

    set_precision(oldprecision);		// restore original precision.
    return status;
}

// This is a faster modexp for moduli with a known factorisation into two
// relatively prime factors p and q, and an input relatively prime to the
// modulus, the Chinese Remainder Theorem to do the computation mod p and
// mod q, and then combine the results.  This relies on a number of
// precomputed values, but does not actually require the modulus n or the
// exponent e.
 
// expout = expin ^ e mod (p*q).
// We form this by evaluating
// p2 = (expin ^ e) mod p and
// q2 = (expin ^ e) mod q
// and then combining the two by the CRT.
 
// Two optimisations of this are possible.  First, we can reduce expin
// modulo p and q before starting.
 
// Second, since we know the factorisation of p and q (trivially derived
// from the factorisation of n = p*q), and expin is relatively prime to
// both p and q, we can use Euler's theorem, expin^phi(m) = 1 (mod m),
// to throw away multiples of phi(p) or phi(q) in e.
// Letting ep = e mod phi(p) and
//         eq = e mod phi(q)
// then combining these two speedups, we only need to evaluate
// p2 = ((expin mod p) ^ ep) mod p and
// q2 = ((expin mod q) ^ eq) mod q.
 
// Now we need to apply the CRT.  Starting with
// expout = p2 (mod p) and
// expout = q2 (mod q)
// we can say that expout = p2 + p * k, and if we assume that 0 <= p2 < p,
// then 0 <= expout < p*q for some 0 <= k < q.  Since we want expout = q2
// (mod q), then p*k = q2-p2 (mod q).  Since p and q are relatively prime,
// p has a multiplicative inverse u mod q.  In other words, u = 1/p (mod q).
//
// Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
// Since we want 0 <= k < q, we can thus find k as
// k = (u * (q2-p2)) mod q.
 
// Once we have k, evaluating p2 + p * k is easy, and
// that gives us the result.
 
// In the detailed implementation, there is a temporary, temp, used to
// hold intermediate results, p2 is held in expout, and q2 is used as a
// temporary in the final step when it is no longer needed.  With that,
// you should be able to understand the code below.
//.........................................................................
int mp_modexp_crt(unitptr expout, unitptr expin, unitptr p, unitptr q, 
				  unitptr ep, unitptr eq, unitptr u)
{
	// Computes expout = expin ^ e mod (p*q) from the two half-size
	// exponentiations mod p and mod q, glued together with u = 1/p mod q.
	//
	// Returns 0 on success.  If any step fails (mp_mod, mp_modexp or
	// mp_mult returning a negative status, e.g. -15 for a user cancel),
	// that status is returned and expout is set to 1.  The temporaries
	// q2 and temp are burned on every exit.
    unit	q2[MAX_UNIT_PRECISION];
    unit	temp[MAX_UNIT_PRECISION];
    int		status;

	// First, compute p2 (physically held in expout)
	// p2 = [ (expin mod p) ^ ep ] mod p.
	//........................................
    status = mp_mod(temp, expin, p);			// temp = expin mod p.
    if (status >= 0)
	{
		status = mp_modexp(expout, temp, ep, p);
	}

	// And the same thing for q2
	// q2 = [ (expin mod q) ^ eq ] mod q
	//..................................
    if (status >= 0)
	{
		status = mp_mod(temp, expin, q);		// temp = expin mod q.
	}
    if (status >= 0)
	{
		status = mp_modexp(q2, temp, eq, q);
	}

	// Now use the multiplicative inverse u to glue together the
    // two halves.
	//..........................................................
    if (status >= 0)
	{
		// Find q2-p2 mod q.
		//..................
		if (mp_sub(q2, expout))
		{
			// If the result went negative add q to q2.
			//.........................................
			mp_add(q2, q);
		}

		// expout = p2 + ( p * [(q2*u) mod q] )
		//.....................................
		status = mp_mult(temp, q2, u);			// q2*u
	}
    if (status >= 0)
	{
		status = mp_mod(q2, temp, q);			// (q2*u) mod q
	}
    if (status >= 0)
	{
		status = mp_mult(temp, p, q2);			// p * [(q2*u) mod q]
	}

    if (status >= 0)
	{
		mp_add(expout, temp);					// expout = p2 + p * [...]
		status = 0;
	}
    else
	{
		// A step failed: hand back a harmless value, as the original
		// error exits did.
		//...........................................................
		mp_init(expout, 1);
	}

    mp_burn(q2);
    mp_burn(temp);
    return status;
}

// Next and previous index in a ring of three slots (0, 1, 2).
// Both expand to an expression that reads a variable named i, which
// must be in scope wherever they are used.
#define iplus1  ( i==2 ? 0 : i+1 )	/* used by Euclid algorithms */
#define iminus1 ( i==0 ? 2 : i-1 )	/* used by Euclid algorithms */

// Greatest common divisor of a and n by Euclid's algorithm.
// Three working registers are used in rotation: each pass replaces the
// oldest one with (previous mod current) until the current one is zero;
// the previous one is then the answer, which is copied to result.
//.........................................................
void mp_gcd(unitptr result, unitptr a, unitptr n)
{
    unit	ring[3][MAX_UNIT_PRECISION];
    unitptr	prev = &ring[0][0];
    unitptr	cur = &ring[1][0];
    unitptr	next = &ring[2][0];
    unitptr	spare;

    mp_move(prev, n);
    mp_move(cur, a);

    while (testne(cur, 0)) 
	{
		EmptyTheMessageQue();

		mp_mod(next, prev, cur);

		// Rotate the three roles one step forward.
		//.........................................
		spare = prev;
		prev = cur;
		cur = next;
		next = spare;
    }
    mp_move(result, prev);

    // cur is already zero; wipe the other two registers.
	//...................................................
    mp_burn(prev);
    mp_burn(next);
}

// Multiplicative inverse by the extended Euclid algorithm.
// Computes x such that a*x mod n = 1, where 0<a<n
// Two rings of three registers are rotated in step: the g ring holds
// the Euclid remainders and the v ring the matching coefficients of a.
// The coefficient of n (u in the textbook form, where
// g = u*n + v*a) is not needed and is never computed.
//...............................................................
void mp_inv(unitptr x, unitptr a, unitptr n)
{
    unit	quot[MAX_UNIT_PRECISION], scaled[MAX_UNIT_PRECISION];
    unit	gring[3][MAX_UNIT_PRECISION], vring[3][MAX_UNIT_PRECISION];
    unitptr	gprev = &gring[0][0], gcur = &gring[1][0], gnext = &gring[2][0];
    unitptr	vprev = &vring[0][0], vcur = &vring[1][0], vnext = &vring[2][0];
    unitptr	spare;

    mp_move(gprev, n);
    mp_move(gcur, a);

    mp_init(vprev, 0);
    mp_init(vcur, 1);

    while (testne(gcur, 0)) 
	{
		EmptyTheMessageQue();

		// gnext = gprev mod gcur, quot = gprev / gcur.
		//.............................................
		mp_udiv(gnext, quot, gprev, gcur);

		// vnext = vprev - quot * vcur.
		//.............................
		mp_mult(scaled, quot, vcur);
		mp_move(vnext, vprev);
		mp_sub(vnext, scaled);

		// Rotate both rings one step forward.
		//....................................
		spare = gprev;
		gprev = gcur;
		gcur = gnext;
		gnext = spare;

		spare = vprev;
		vprev = vcur;
		vcur = vnext;
		vnext = spare;
    }

    mp_move(x, vprev);
    if (mp_tstminus(x))
	{
		mp_add(x, n);		// bring a negative coefficient into 0..n-1.
	}

    // Wipe every working register (gcur is already zero).
	//....................................................
    mp_burn(gprev);
    mp_burn(gnext);
    mp_burn(&vring[0][0]);
    mp_burn(&vring[1][0]);
    mp_burn(&vring[2][0]);
    mp_burn(quot);
    mp_burn(scaled);
}
